In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
import warnings
warnings.filterwarnings('ignore')

class DBSCANHACExperimentSuite:
    """
    DBSCAN and Hierarchical Agglomerative Clustering (HAC) experiment suite.
    Includes eps/min_samples optimization, density analysis, linkage-method comparison, and dendrogram analysis.
    """
    
    def __init__(self, eps_range=(0.1, 2.0), eps_steps=20, min_samples_range=(3, 20)):
        self.eps_range = eps_range
        self.eps_steps = eps_steps
        self.min_samples_range = min_samples_range
        self.results = {}
        self.linkage_methods = ['ward', 'complete', 'average', 'single']
        
    def load_datasets(self):
        """Load and prepare the three datasets from your notebook"""
        print("Loading datasets...")
        
        self.datasets = {}
        
        # Dataset 1: Small collision dataset (classification features)
        try:
            injury_df = pd.read_csv('scripts/punt_analytics/injury_collision_features.csv')
            normal_df = pd.read_csv('scripts/punt_analytics/normal_collision_features.csv')
            collision_df = pd.concat([injury_df, normal_df], ignore_index=True)
            
            # Remove non-numeric and identifier columns for clustering
            exclude_cols = ['season_year', 'gamekey', 'playid', 'injured_player', 
                          'partner_player', 'impact_type', 'player_activity', 
                          'partner_activity', 'friendly_fire', 'is_injury']
            
            collision_features = collision_df.select_dtypes(include=[np.number]).columns
            collision_features = [col for col in collision_features if col not in exclude_cols]
            
            collision_data = collision_df[collision_features].dropna()
            
            if len(collision_data) > 0:
                self.datasets['collision'] = {
                    'data': collision_data,
                    'name': 'NFL Collision Analysis',
                    'description': f'Player collision biomechanics ({len(collision_data)} samples)',
                    'target': collision_df.loc[collision_data.index, 'is_injury'] if 'is_injury' in collision_df.columns else None
                }
                print(f"Collision dataset loaded: {collision_data.shape}")
            else:
                print("Warning: Collision dataset is empty after cleaning")
            
        except Exception as e:
            print(f"Could not load collision dataset: {e}")
        
        # Dataset 2: Large injury prediction dataset
        try:
            play_surface_df = pd.read_csv('scripts/play_surface/processed_motion_features.csv')
            
            # Select relevant numeric features for clustering
            numeric_cols = play_surface_df.select_dtypes(include=[np.number]).columns
            exclude_cols = ['injury', 'PlayKey', 'PlayerDay', 'PlayerGame', 'DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']
            feature_cols = [col for col in numeric_cols if col not in exclude_cols]
            
            # Sample for computational efficiency
            sample_size = min(5000, len(play_surface_df))
            sampled_data = play_surface_df.sample(n=sample_size, random_state=42)
            
            injury_data = sampled_data[feature_cols].dropna()
            
            if len(injury_data) > 0:
                self.datasets['injury_prediction'] = {
                    'data': injury_data,
                    'name': 'NFL Injury Prediction',
                    'description': f'Player movement patterns (sampled {len(injury_data)} from {len(play_surface_df)})',
                    'target': sampled_data.loc[injury_data.index, 'injury'] if 'injury' in sampled_data.columns else None
                }
                print(f"Injury prediction dataset loaded: {injury_data.shape}")
            else:
                print("Warning: Injury prediction dataset is empty after cleaning")
            
        except Exception as e:
            print(f"Could not load injury prediction dataset: {e}")
        
        # Dataset 3: Big Data Bowl regression dataset
        try:
            X_train = pd.read_csv('scripts/big_data_bowl/X_train.csv')
            y_train = pd.read_csv('scripts/big_data_bowl/y_train.csv')
            
            print(f"Big Data Bowl raw data shape: {X_train.shape}")
            
            # Check if data exists and is valid
            if len(X_train) == 0:
                print("Warning: X_train is empty")
                raise ValueError("Empty X_train dataset")
            
            # Select only numeric columns and remove any with all NaN values
            numeric_data = X_train.select_dtypes(include=[np.number])
            print(f"Numeric columns found: {len(numeric_data.columns)}")
            
            # Remove columns with all NaN or constant values
            numeric_data = numeric_data.dropna(axis=1, how='all')  # Remove columns with all NaN
            numeric_data = numeric_data.loc[:, numeric_data.var() > 1e-8]  # Remove constant columns
            
            print(f"After removing constant/NaN columns: {numeric_data.shape}")
            
            # Remove rows with any NaN values
            clean_data = numeric_data.dropna()
            print(f"After removing NaN rows: {clean_data.shape}")
            
            if len(clean_data) < 10:
                print(f"Warning: Too few samples ({len(clean_data)}) in Big Data Bowl dataset")
                raise ValueError("Insufficient data after cleaning")
            
            # Sample for efficiency
            sample_size = min(3000, len(clean_data))
            if sample_size < len(clean_data):
                sample_idx = np.random.choice(len(clean_data), sample_size, replace=False)
                sampled_data = clean_data.iloc[sample_idx]
            else:
                sampled_data = clean_data
            
            print(f"Final sampled data shape: {sampled_data.shape}")
            
            # Prepare target if available; align target rows with the sampled feature
            # rows via the preserved index (assumes X_train and y_train share row order)
            target = None
            if len(y_train) > 0 and len(y_train.columns) == 1:
                target = y_train.iloc[:, 0].reindex(sampled_data.index).values
                print(f"Target variable loaded: {len(target)} values")
            
            self.datasets['big_data_bowl'] = {
                'data': sampled_data,
                'name': 'NFL Big Data Bowl',
                'description': f'Play outcome prediction features ({len(sampled_data)} samples)',
                'target': target
            }
            print(f"Big Data Bowl dataset loaded successfully: {sampled_data.shape}")
            
        except Exception as e:
            print(f"Could not load Big Data Bowl dataset: {e}")
        
        if not self.datasets:
            print("Error: No datasets were successfully loaded!")
            return None
        
        print(f"Successfully loaded {len(self.datasets)} datasets")
        return self.datasets

    def validate_dataset(self, dataset_key, min_samples=10):
        """Validate that dataset has sufficient data for clustering"""
        if dataset_key not in self.datasets:
            print(f"Dataset {dataset_key} not found")
            return False
        
        data = self.datasets[dataset_key]['data']
        
        if len(data) < min_samples:
            print(f"Dataset {dataset_key} has insufficient samples: {len(data)} < {min_samples}")
            return False
        
        if data.shape[1] < 2:
            print(f"Dataset {dataset_key} has insufficient features: {data.shape[1]} < 2")
            return False
        
        # Check for variance
        if np.any(data.var() < 1e-8):
            print(f"Warning: Dataset {dataset_key} has features with very low variance")
        
        return True

    def experiment_1_dbscan_parameter_optimization(self, dataset_key):
        """
        Experiment 1: DBSCAN Parameter Optimization
        Find optimal eps and min_samples using k-distance graph and grid search
        """
        print(f"\nEXPERIMENT 1: DBSCAN Parameter Optimization - {self.datasets[dataset_key]['name']}")
        print("="*80)
        
        if not self.validate_dataset(dataset_key):
            print(f"Skipping DBSCAN optimization for {dataset_key} due to insufficient data")
            return None
        
        data = self.datasets[dataset_key]['data']
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
        
        print(f"Data shape for DBSCAN: {data_scaled.shape}")
        
        # K-distance graph for eps estimation
        print("Computing k-distance graph for eps estimation...")
        k_values = range(3, min(21, len(data_scaled)//10, len(data_scaled)))
        k_distances = {}
        
        for k in k_values:
            if k >= len(data_scaled):
                continue
            nn = NearestNeighbors(n_neighbors=k)
            nn.fit(data_scaled)
            distances, _ = nn.kneighbors(data_scaled)
            k_distances[k] = np.sort(distances[:, k-1])[::-1]  # Sort in descending order
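        # Reading the k-distance curve: the sorted k-th neighbour distances typically
        # show an "elbow" where they jump sharply, and that distance is a common
        # starting value for eps (with min_samples ~ k). Note that kneighbors() on the
        # training data returns each point as its own first neighbour (distance 0), so
        # distances[:, k-1] corresponds to k points including the point itself, which
        # matches scikit-learn's convention that min_samples counts the point.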
        
        # Grid search for optimal parameters
        print("Grid search for optimal DBSCAN parameters...")
        eps_values = np.linspace(self.eps_range[0], self.eps_range[1], self.eps_steps)
        min_samples_values = range(self.min_samples_range[0], 
                                 min(self.min_samples_range[1]+1, len(data_scaled)//2))
        
        grid_results = []
        best_score = -1
        best_params = None
        
        for eps in eps_values:
            for min_samples in min_samples_values:
                try:
                    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                    labels = dbscan.fit_predict(data_scaled)
                    
                    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                    n_noise = list(labels).count(-1)
                    
                    # Calculate metrics only if we have valid clusters
                    if n_clusters > 1:
                        try:
                            # Remove noise points for silhouette calculation
                            non_noise_mask = labels != -1
                            if np.sum(non_noise_mask) > 1:
                                sil_score = silhouette_score(data_scaled[non_noise_mask], labels[non_noise_mask])
                            else:
                                sil_score = -1
                        except:
                            sil_score = -1
                    else:
                        sil_score = -1
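                    # Noise points (label -1) are excluded from the silhouette because
                    # -1 is not a cluster; including them would distort the score.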
                    
                    grid_results.append({
                        'eps': eps,
                        'min_samples': min_samples,
                        'n_clusters': n_clusters,
                        'n_noise': n_noise,
                        'noise_ratio': n_noise / len(labels),
                        'silhouette_score': sil_score
                    })
                    
                    # Update best parameters
                    if sil_score > best_score and n_clusters > 1:
                        best_score = sil_score
                        best_params = {'eps': eps, 'min_samples': min_samples}
                        
                except Exception as e:
                    print(f"Error with eps={eps:.2f}, min_samples={min_samples}: {e}")
                    continue
        
        if not grid_results:
            print("No valid DBSCAN results found")
            return None
        
        # Convert to DataFrame for analysis
        grid_df = pd.DataFrame(grid_results)
        
        # Store results
        self.results[dataset_key] = self.results.get(dataset_key, {})
        self.results[dataset_key]['dbscan_optimization'] = {
            'k_distances': k_distances,
            'grid_results': grid_df,
            'best_params': best_params,
            'best_score': best_score
        }
        
        # Visualization
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'DBSCAN Parameter Optimization - {self.datasets[dataset_key]["name"]}', fontsize=16)
        
        # K-distance graph
        for k in [4, 6, 8, 10]:
            if k in k_distances:
                axes[0, 0].plot(k_distances[k], label=f'k={k}')
        
        axes[0, 0].set_xlabel('Points sorted by distance')
        axes[0, 0].set_ylabel('k-distance')
        axes[0, 0].set_title('K-distance Graph for Eps Estimation')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)
        
        # Silhouette score heatmap
        if len(grid_df) > 0:
            pivot_sil = grid_df.pivot_table(index='min_samples', columns='eps', values='silhouette_score', fill_value=-1)
            sns.heatmap(pivot_sil, annot=False, cmap='viridis', ax=axes[0, 1])
            axes[0, 1].set_title('Silhouette Score Heatmap')
            
            # Number of clusters heatmap
            pivot_clusters = grid_df.pivot_table(index='min_samples', columns='eps', values='n_clusters', fill_value=0)
            # Convert to int for proper formatting
            pivot_clusters = pivot_clusters.round().astype(int)
            sns.heatmap(pivot_clusters, annot=True, fmt='d', cmap='plasma', ax=axes[0, 2])
            axes[0, 2].set_title('Number of Clusters')
            
            # Noise ratio heatmap
            pivot_noise = grid_df.pivot_table(index='min_samples', columns='eps', values='noise_ratio', fill_value=1)
            sns.heatmap(pivot_noise, annot=False, cmap='Reds', ax=axes[1, 0])
            axes[1, 0].set_title('Noise Ratio')
            
            # Parameter relationships
            axes[1, 1].scatter(grid_df['eps'], grid_df['silhouette_score'], 
                              c=grid_df['min_samples'], cmap='viridis', alpha=0.6)
            axes[1, 1].set_xlabel('Eps')
            axes[1, 1].set_ylabel('Silhouette Score')
            axes[1, 1].set_title('Eps vs Silhouette Score')
            plt.colorbar(axes[1, 1].collections[0], ax=axes[1, 1], label='min_samples')
        
        # Best clustering result
        if best_params:
            best_dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
            best_labels = best_dbscan.fit_predict(data_scaled)
            
            if data_scaled.shape[1] > 2:
                pca = PCA(n_components=2, random_state=42)
                data_pca = pca.fit_transform(data_scaled)
            else:
                data_pca = data_scaled
            
            scatter = axes[1, 2].scatter(data_pca[:, 0], data_pca[:, 1], c=best_labels, 
                                       cmap='viridis', alpha=0.6, s=50)
            axes[1, 2].set_xlabel('First Principal Component')
            axes[1, 2].set_ylabel('Second Principal Component')
            axes[1, 2].set_title(f'Best DBSCAN Clustering\n(eps={best_params["eps"]:.2f}, min_samples={best_params["min_samples"]})')
            plt.colorbar(scatter, ax=axes[1, 2])
        
        plt.tight_layout()
        plt.show()
        
        if best_params:
            print(f"Optimal DBSCAN parameters:")
            print(f"   Eps: {best_params['eps']:.3f}")
            print(f"   Min_samples: {best_params['min_samples']}")
            print(f"   Best silhouette score: {best_score:.3f}")
        else:
            print("No valid clustering found in parameter space")
        
        return best_params

    def experiment_2_density_analysis(self, dataset_key):
        """
        Experiment 2: Density Analysis and Core Points Investigation
        Analyze density distribution and core points characteristics
        """
        print(f"\nEXPERIMENT 2: Density Analysis - {self.datasets[dataset_key]['name']}")
        print("="*80)
        
        if not self.validate_dataset(dataset_key):
            print(f"Skipping density analysis for {dataset_key} due to insufficient data")
            return None
        
        data = self.datasets[dataset_key]['data']
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
        
        # Get optimal parameters from previous experiment
        best_params = self.results.get(dataset_key, {}).get('dbscan_optimization', {}).get('best_params')
        if not best_params:
            best_params = {'eps': 0.5, 'min_samples': min(5, len(data_scaled)//10)}
            print(f"Using default parameters: eps={best_params['eps']}, min_samples={best_params['min_samples']}")
        
        # Fit DBSCAN with optimal parameters
        dbscan = DBSCAN(eps=best_params['eps'], min_samples=best_params['min_samples'])
        labels = dbscan.fit_predict(data_scaled)
        
        # Density analysis
        print("Analyzing point densities and core points...")
        
        # Find core points
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        if hasattr(dbscan, 'core_sample_indices_') and len(dbscan.core_sample_indices_) > 0:
            core_samples_mask[dbscan.core_sample_indices_] = True
        
        # Calculate local density for each point
        nn = NearestNeighbors(radius=best_params['eps'])
        nn.fit(data_scaled)
        neighbors = nn.radius_neighbors(data_scaled, return_distance=False)
        local_densities = [len(neighbor_list) for neighbor_list in neighbors]
        
        # Classify points
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        n_core = len(dbscan.core_sample_indices_) if hasattr(dbscan, 'core_sample_indices_') else 0
        n_border = len(data_scaled) - n_core - n_noise
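        # DBSCAN point taxonomy: a point is "core" if its eps-neighbourhood (itself
        # included) contains at least min_samples points; "border" if it lies inside a
        # core point's neighbourhood without being core itself; otherwise "noise"
        # (labelled -1 by scikit-learn).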
        
        # Analyze cluster densities
        cluster_stats = {}
        unique_labels = set(labels)
        
        for label in unique_labels:
            if label == -1:  # Noise points
                continue
                
            cluster_mask = labels == label
            cluster_points = data_scaled[cluster_mask]
            cluster_core_mask = core_samples_mask[cluster_mask]
            cluster_densities = np.array(local_densities)[cluster_mask]
            
            cluster_stats[label] = {
                'size': np.sum(cluster_mask),
                'n_core_points': np.sum(cluster_core_mask),
                'avg_density': np.mean(cluster_densities),
                'std_density': np.std(cluster_densities),
                'min_density': np.min(cluster_densities),
                'max_density': np.max(cluster_densities)
            }
        
        # Store results
        self.results.setdefault(dataset_key, {})['density_analysis'] = {
            'labels': labels,
            'core_samples_mask': core_samples_mask,
            'local_densities': local_densities,
            'cluster_stats': cluster_stats,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'n_core': n_core,
            'n_border': n_border
        }
        
        # Visualization
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'DBSCAN Density Analysis - {self.datasets[dataset_key]["name"]}', fontsize=16)
        
        # Point type distribution
        point_types = ['Core', 'Border', 'Noise']
        point_counts = [n_core, n_border, n_noise]
        colors = ['green', 'orange', 'red']
        
        axes[0, 0].pie(point_counts, labels=point_types, colors=colors, autopct='%1.1f%%', startangle=90)
        axes[0, 0].set_title('Point Type Distribution')
        
        # Density distribution by point type
        core_densities = [local_densities[i] for i in range(len(local_densities)) if core_samples_mask[i]]
        border_densities = [local_densities[i] for i in range(len(local_densities)) 
                           if not core_samples_mask[i] and labels[i] != -1]
        noise_densities = [local_densities[i] for i in range(len(local_densities)) if labels[i] == -1]
        
        # Only plot non-empty lists
        density_lists = []
        density_labels = []
        density_colors = []
        
        if core_densities:
            density_lists.append(core_densities)
            density_labels.append('Core')
            density_colors.append('green')
        if border_densities:
            density_lists.append(border_densities)
            density_labels.append('Border')
            density_colors.append('orange')
        if noise_densities:
            density_lists.append(noise_densities)
            density_labels.append('Noise')
            density_colors.append('red')
        
        if density_lists:
            axes[0, 1].hist(density_lists, bins=20, alpha=0.7, label=density_labels, color=density_colors)
            axes[0, 1].set_xlabel('Local Density')
            axes[0, 1].set_ylabel('Frequency')
            axes[0, 1].set_title('Density Distribution by Point Type')
            axes[0, 1].legend()
            axes[0, 1].grid(True, alpha=0.3)
        
        # Cluster size vs average density
        if cluster_stats:
            cluster_sizes = [stats['size'] for stats in cluster_stats.values()]
            cluster_avg_densities = [stats['avg_density'] for stats in cluster_stats.values()]
            
            axes[0, 2].scatter(cluster_sizes, cluster_avg_densities, s=100, alpha=0.7, c='blue')
            axes[0, 2].set_xlabel('Cluster Size')
            axes[0, 2].set_ylabel('Average Density')
            axes[0, 2].set_title('Cluster Size vs Average Density')
            axes[0, 2].grid(True, alpha=0.3)
            
            # Add cluster labels
            for i, (size, density) in enumerate(zip(cluster_sizes, cluster_avg_densities)):
                axes[0, 2].annotate(f'C{i}', (size, density), xytext=(5, 5), 
                                   textcoords='offset points', fontsize=8)
        
        # Clustering visualization with point types
        if data_scaled.shape[1] > 2:
            pca = PCA(n_components=2, random_state=42)
            data_pca = pca.fit_transform(data_scaled)
        else:
            data_pca = data_scaled
        
        # Core points
        if np.any(core_samples_mask):
            core_points = data_pca[core_samples_mask]
            axes[1, 0].scatter(core_points[:, 0], core_points[:, 1], c=labels[core_samples_mask], 
                              cmap='viridis', s=50, alpha=0.8, label='Core', marker='o')
        
        # Border points
        border_mask = (~core_samples_mask) & (labels != -1)
        if np.any(border_mask):
            border_points = data_pca[border_mask]
            axes[1, 0].scatter(border_points[:, 0], border_points[:, 1], c=labels[border_mask], 
                              cmap='viridis', s=30, alpha=0.6, label='Border', marker='s')
        
        # Noise points
        noise_mask = labels == -1
        if np.any(noise_mask):
            noise_points = data_pca[noise_mask]
            axes[1, 0].scatter(noise_points[:, 0], noise_points[:, 1], c='red', 
                              s=20, alpha=0.5, label='Noise', marker='x')
        
        axes[1, 0].set_xlabel('First Principal Component')
        axes[1, 0].set_ylabel('Second Principal Component')
        axes[1, 0].set_title('DBSCAN Clustering with Point Types')
        axes[1, 0].legend()
        
        # Density heatmap
        from scipy.stats import gaussian_kde
        if len(data_pca) > 10:
            try:
                kde = gaussian_kde(data_pca.T)
                x_min, x_max = data_pca[:, 0].min(), data_pca[:, 0].max()
                y_min, y_max = data_pca[:, 1].min(), data_pca[:, 1].max()
                
                xx, yy = np.mgrid[x_min:x_max:50j, y_min:y_max:50j]
                positions = np.vstack([xx.ravel(), yy.ravel()])
                density = np.reshape(kde(positions).T, xx.shape)
                
                im = axes[1, 1].contourf(xx, yy, density, cmap='Blues', alpha=0.6)
                axes[1, 1].scatter(data_pca[:, 0], data_pca[:, 1], c=labels, cmap='viridis', s=30, alpha=0.8)
                plt.colorbar(im, ax=axes[1, 1])
                axes[1, 1].set_title('Data Density Heatmap')
            except:
                axes[1, 1].scatter(data_pca[:, 0], data_pca[:, 1], c=labels, cmap='viridis', s=30, alpha=0.8)
                axes[1, 1].set_title('Clustering Result')
        
        # Cluster statistics
        if cluster_stats:
            cluster_ids = list(cluster_stats.keys())
            avg_densities = [cluster_stats[cid]['avg_density'] for cid in cluster_ids]
            std_densities = [cluster_stats[cid]['std_density'] for cid in cluster_ids]
            
            axes[1, 2].bar(cluster_ids, avg_densities, yerr=std_densities, capsize=5,
                          alpha=0.7, color='skyblue', edgecolor='black')
            axes[1, 2].set_xlabel('Cluster ID')
            axes[1, 2].set_ylabel('Average Density ± Std')
            axes[1, 2].set_title('Cluster Density Statistics')
            axes[1, 2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        print(f"Density Analysis Results:")
        print(f"   Total points: {len(data_scaled)}")
        print(f"   Core points: {n_core} ({n_core/len(data_scaled)*100:.1f}%)")
        print(f"   Border points: {n_border} ({n_border/len(data_scaled)*100:.1f}%)")
        print(f"   Noise points: {n_noise} ({n_noise/len(data_scaled)*100:.1f}%)")
        print(f"   Number of clusters: {n_clusters}")
        
        if cluster_stats:
            print(f"Cluster density statistics:")
            for cid, stats in cluster_stats.items():
                print(f"     Cluster {cid}: {stats['size']} points, "
                      f"density = {stats['avg_density']:.1f}±{stats['std_density']:.1f}")

    def experiment_3_hierarchical_clustering_linkages(self, dataset_key):
        """
        Experiment 3: Hierarchical Agglomerative Clustering with Different Linkage Methods
        Compare ward, complete, average, and single linkage methods
        """
        print(f"\nEXPERIMENT 3: Hierarchical Clustering Linkages - {self.datasets[dataset_key]['name']}")
        print("="*80)
        
        if not self.validate_dataset(dataset_key, min_samples=4):
            print(f"Skipping hierarchical clustering for {dataset_key} due to insufficient data")
            return None
        
        data = self.datasets[dataset_key]['data']
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
        
        # Sample data if too large for hierarchical clustering
        max_samples = 1000
        if len(data_scaled) > max_samples:
            print(f"Sampling {max_samples} points for hierarchical clustering (original: {len(data_scaled)})")
            sample_idx = np.random.choice(len(data_scaled), max_samples, replace=False)
            data_scaled = data_scaled[sample_idx]
        
        if len(data_scaled) < 4:
            print(f"Insufficient data for hierarchical clustering: {len(data_scaled)} samples")
            return None
        
        # Test different numbers of clusters
        n_clusters_range = range(2, min(21, len(data_scaled)//2))
        linkage_results = {}
        
        print("Testing different linkage methods...")
        
        for linkage_method in self.linkage_methods:
            print(f"   Testing {linkage_method} linkage...")
            
            # Ward linkage is only defined for Euclidean distances; the other linkages
            # could use alternative metrics, but Euclidean is kept throughout so the
            # methods are compared like-for-like.
            metric = 'euclidean'
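            # Linkage definitions: single = minimum pairwise distance between clusters,
            # complete = maximum, average = mean pairwise distance, and ward = the merge
            # that least increases total within-cluster variance.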
            
            try:
                # Compute linkage matrix for dendrogram
                linkage_matrix = linkage(data_scaled, method=linkage_method, metric=metric)
                
                # Test different numbers of clusters
                silhouette_scores = []
                cluster_assignments = {}
                
                for n_clusters in n_clusters_range:
                    try:
                        hac = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage_method)
                        labels = hac.fit_predict(data_scaled)
                        
                        if len(set(labels)) > 1:
                            sil_score = silhouette_score(data_scaled, labels)
                            silhouette_scores.append(sil_score)
                            cluster_assignments[n_clusters] = labels
                        else:
                            silhouette_scores.append(-1)
                            cluster_assignments[n_clusters] = labels
                    except Exception as e:
                        print(f"     Error with {n_clusters} clusters: {e}")
                        silhouette_scores.append(-1)
                        cluster_assignments[n_clusters] = np.zeros(len(data_scaled))
                
                if silhouette_scores and max(silhouette_scores) > -1:
                    optimal_k = n_clusters_range[np.argmax(silhouette_scores)]
                    best_score = max(silhouette_scores)
                    
                    linkage_results[linkage_method] = {
                        'linkage_matrix': linkage_matrix,
                        'silhouette_scores': silhouette_scores,
                        'optimal_k': optimal_k,
                        'best_score': best_score,
                        'cluster_assignments': cluster_assignments,
                        'n_clusters_range': list(n_clusters_range)
                    }
                else:
                    print(f"     No valid results for {linkage_method}")
                
            except Exception as e:
                print(f"     Error with {linkage_method}: {e}")
                continue
        
        if not linkage_results:
            print("No valid hierarchical clustering results found")
            return None
        
        # Store results
        self.results.setdefault(dataset_key, {})['hac_linkages'] = linkage_results
        
        # Visualization
        n_methods = len(linkage_results)
        fig, axes = plt.subplots(3, n_methods, figsize=(5*n_methods, 15))
        if n_methods == 1:
            axes = axes.reshape(3, 1)
        
        fig.suptitle(f'Hierarchical Clustering Linkage Comparison - {self.datasets[dataset_key]["name"]}', 
                    fontsize=16)
        
        for i, (method, results) in enumerate(linkage_results.items()):
            # Dendrogram
            try:
                dendrogram(results['linkage_matrix'], ax=axes[0, i], truncate_mode='level', p=10)
                axes[0, i].set_title(f'{method.title()} Linkage Dendrogram')
                axes[0, i].set_xlabel('Sample Index or (Cluster Size)')
                axes[0, i].set_ylabel('Distance')
            except:
                axes[0, i].text(0.5, 0.5, f'Dendrogram error\nfor {method}', 
                               ha='center', va='center', transform=axes[0, i].transAxes)
                axes[0, i].set_title(f'{method.title()} Linkage (Error)')
            
            # Silhouette scores
            axes[1, i].plot(results['n_clusters_range'], results['silhouette_scores'], 'bo-', linewidth=2)
            axes[1, i].axvline(x=results['optimal_k'], color='red', linestyle='--', alpha=0.7,
                              label=f"Optimal k={results['optimal_k']}")
            axes[1, i].set_xlabel('Number of Clusters')
            axes[1, i].set_ylabel('Silhouette Score')
            axes[1, i].set_title(f'{method.title()} - Silhouette Analysis')
            axes[1, i].grid(True, alpha=0.3)
            axes[1, i].legend()
            
            # Optimal clustering visualization
            optimal_labels = results['cluster_assignments'][results['optimal_k']]
            
            if data_scaled.shape[1] > 2:
                pca = PCA(n_components=2, random_state=42)
                data_pca = pca.fit_transform(data_scaled)
            else:
                data_pca = data_scaled
            
            scatter = axes[2, i].scatter(data_pca[:, 0], data_pca[:, 1], c=optimal_labels, 
                                       cmap='viridis', alpha=0.6, s=50)
            axes[2, i].set_xlabel('First Principal Component')
            axes[2, i].set_ylabel('Second Principal Component')
            axes[2, i].set_title(f'{method.title()} - Optimal Clustering (k={results["optimal_k"]})')
            plt.colorbar(scatter, ax=axes[2, i])
        
        plt.tight_layout()
        plt.show()
        
        # Performance comparison
        print(f"Linkage Method Performance Comparison:")
        print("-" * 60)
        print(f"{'Method':<12} {'Optimal k':<10} {'Best Silhouette':<15}")
        print("-" * 60)
        
        best_method = None
        best_overall_score = -1
        
        for method, results in linkage_results.items():
            print(f"{method.title():<12} {results['optimal_k']:<10} {results['best_score']:<15.3f}")
            if results['best_score'] > best_overall_score:
                best_overall_score = results['best_score']
                best_method = method
        
        print(f"\nBest linkage method: {best_method.title()} (score: {best_overall_score:.3f})")
        
        return linkage_results

    def experiment_4_dendrogram_analysis(self, dataset_key):
        """
        Experiment 4: Detailed Dendrogram Analysis
        In-depth analysis of hierarchical clustering structure and cut heights
        """
        print(f"\nEXPERIMENT 4: Dendrogram Analysis - {self.datasets[dataset_key]['name']}")
        print("="*80)
        
        if not self.validate_dataset(dataset_key, min_samples=4):
            print(f"Skipping dendrogram analysis for {dataset_key} due to insufficient data")
            return None
        
        data = self.datasets[dataset_key]['data']
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
        
        # Sample data if necessary
        max_samples = 500  # Smaller sample for detailed dendrogram analysis
        if len(data_scaled) > max_samples:
            print(f"Sampling {max_samples} points for detailed dendrogram analysis")
            sample_idx = np.random.choice(len(data_scaled), max_samples, replace=False)
            data_scaled = data_scaled[sample_idx]
        
        if len(data_scaled) < 4:
            print(f"Insufficient data for dendrogram analysis: {len(data_scaled)} samples")
            return None
        
        # Get best linkage method from previous experiment
        hac_results = self.results.get(dataset_key, {}).get('hac_linkages', {})
        if hac_results:
            best_method = max(hac_results.keys(), key=lambda x: hac_results[x]['best_score'])
        else:
            best_method = 'ward'
        
        print(f"Using {best_method} linkage for detailed analysis...")
        
        try:
            # Compute linkage matrix
            linkage_matrix = linkage(data_scaled, method=best_method)
            
            # Analyze different cut heights
            max_distance = linkage_matrix[:, 2].max()
            min_distance = linkage_matrix[:, 2].min()
            
            cut_heights = np.linspace(min_distance, max_distance * 0.8, 10)
            cut_analysis = []
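            # fcluster with criterion='distance' cuts the dendrogram horizontally at
            # `height`: merges below the cut stay joined, so lower cuts give more,
            # smaller clusters and higher cuts give fewer, larger ones.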
            
            for height in cut_heights:
                labels = fcluster(linkage_matrix, height, criterion='distance')
                n_clusters = len(set(labels))
                
                if n_clusters > 1:
                    try:
                        sil_score = silhouette_score(data_scaled, labels)
                    except:
                        sil_score = -1
                else:
                    sil_score = -1
                
                cut_analysis.append({
                    'height': height,
                    'n_clusters': n_clusters,
                    'silhouette_score': sil_score
                })
            
            cut_df = pd.DataFrame(cut_analysis)
            
            # Find optimal cut height
            valid_cuts = cut_df[cut_df['silhouette_score'] > -1]
            if len(valid_cuts) > 0:
                optimal_cut = valid_cuts.loc[valid_cuts['silhouette_score'].idxmax()]
            else:
                optimal_cut = cut_df.iloc[len(cut_df)//2]  # Middle cut
            
            # Distance matrix analysis
            distance_matrix = squareform(pdist(data_scaled))
            
            # Store results
            self.results.setdefault(dataset_key, {})['dendrogram_analysis'] = {
                'linkage_matrix': linkage_matrix,
                'cut_analysis': cut_df,
                'optimal_cut': optimal_cut,
                'distance_matrix': distance_matrix,
                'best_linkage_method': best_method
            }
            
            # Comprehensive visualization
            fig = plt.figure(figsize=(20, 15))
            gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)
            
            # Large dendrogram with cut line
            ax1 = fig.add_subplot(gs[0, :])
            try:
                dend = dendrogram(linkage_matrix, ax=ax1, color_threshold=optimal_cut['height'])
                ax1.axhline(y=optimal_cut['height'], color='red', linestyle='--', linewidth=2,
                           label=f"Optimal cut (k={optimal_cut['n_clusters']})")
                ax1.set_title(f'Complete Dendrogram - {best_method.title()} Linkage', fontsize=14)
                ax1.set_xlabel('Sample Index or (Cluster Size)')
                ax1.set_ylabel('Distance')
                ax1.legend()
            except Exception as e:
                ax1.text(0.5, 0.5, f'Dendrogram visualization error: {str(e)}', 
                        ha='center', va='center', transform=ax1.transAxes)
                ax1.set_title(f'Dendrogram Error - {best_method.title()} Linkage')
            
            # Cut height analysis
            ax2 = fig.add_subplot(gs[1, 0])
            ax2.plot(cut_df['height'], cut_df['n_clusters'], 'bo-', linewidth=2)
            ax2.axvline(x=optimal_cut['height'], color='red', linestyle='--', alpha=0.7)
            ax2.set_xlabel('Cut Height')
            ax2.set_ylabel('Number of Clusters')
            ax2.set_title('Cut Height vs Number of Clusters')
            ax2.grid(True, alpha=0.3)
            
            # Silhouette vs cut height
            ax3 = fig.add_subplot(gs[1, 1])
            valid_mask = cut_df['silhouette_score'] > -1
            if np.any(valid_mask):
                ax3.plot(cut_df.loc[valid_mask, 'height'], cut_df.loc[valid_mask, 'silhouette_score'], 
                        'go-', linewidth=2)
                ax3.axvline(x=optimal_cut['height'], color='red', linestyle='--', alpha=0.7)
            ax3.set_xlabel('Cut Height')
            ax3.set_ylabel('Silhouette Score')
            ax3.set_title('Cut Height vs Silhouette Score')
            ax3.grid(True, alpha=0.3)
            
            # Distance matrix heatmap
            ax4 = fig.add_subplot(gs[1, 2])
            # Sample for visualization if too large
            if len(distance_matrix) > 100:
                sample_idx = np.random.choice(len(distance_matrix), 100, replace=False)
                sample_matrix = distance_matrix[np.ix_(sample_idx, sample_idx)]
            else:
                sample_matrix = distance_matrix
            
            im = ax4.imshow(sample_matrix, cmap='viridis', aspect='auto')
            ax4.set_title('Distance Matrix Heatmap')
            ax4.set_xlabel('Sample Index')
            ax4.set_ylabel('Sample Index')
            plt.colorbar(im, ax=ax4)
            
            # Optimal clustering result
            ax5 = fig.add_subplot(gs[1, 3])
            optimal_labels = fcluster(linkage_matrix, optimal_cut['height'], criterion='distance')
            
            if data_scaled.shape[1] > 2:
                pca = PCA(n_components=2, random_state=42)
                data_pca = pca.fit_transform(data_scaled)
            else:
                data_pca = data_scaled
            
            scatter = ax5.scatter(data_pca[:, 0], data_pca[:, 1], c=optimal_labels, 
                                cmap='viridis', alpha=0.6, s=50)
            ax5.set_xlabel('First Principal Component')
            ax5.set_ylabel('Second Principal Component')
            ax5.set_title(f'Optimal Clustering\n(k={optimal_cut["n_clusters"]}, score={optimal_cut["silhouette_score"]:.3f})')
            plt.colorbar(scatter, ax=ax5)
            
            # Linkage distance distribution
            ax6 = fig.add_subplot(gs[2, 0])
            ax6.hist(linkage_matrix[:, 2], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
            ax6.axvline(x=optimal_cut['height'], color='red', linestyle='--', linewidth=2)
            ax6.set_xlabel('Linkage Distance')
            ax6.set_ylabel('Frequency')
            ax6.set_title('Distribution of Linkage Distances')
            ax6.grid(True, alpha=0.3)
            
            # Cluster size distribution at optimal cut
            ax7 = fig.add_subplot(gs[2, 1])
            unique, counts = np.unique(optimal_labels, return_counts=True)
            ax7.bar(unique, counts, alpha=0.7, color='lightcoral', edgecolor='black')
            ax7.set_xlabel('Cluster ID')
            ax7.set_ylabel('Cluster Size')
            ax7.set_title('Cluster Size Distribution')
            ax7.grid(True, alpha=0.3)
            
            # Merge sequence analysis
            ax8 = fig.add_subplot(gs[2, 2:])
            merge_distances = linkage_matrix[:, 2]
            ax8.plot(range(len(merge_distances)), merge_distances, 'b-', linewidth=2)
            ax8.set_xlabel('Merge Step')
            ax8.set_ylabel('Merge Distance')
            ax8.set_title('Hierarchical Merge Sequence')
            ax8.grid(True, alpha=0.3)
            
            plt.suptitle(f'Comprehensive Dendrogram Analysis - {self.datasets[dataset_key]["name"]}', 
                        fontsize=16)
            plt.show()
            
            print(f"Dendrogram Analysis Results:")
            print(f"   Best linkage method: {best_method}")
            print(f"   Optimal cut height: {optimal_cut['height']:.3f}")
            print(f"   Resulting clusters: {optimal_cut['n_clusters']}")
            print(f"   Silhouette score: {optimal_cut['silhouette_score']:.3f}")
            print(f"   Distance range: {min_distance:.3f} - {max_distance:.3f}")
            
            return optimal_cut
            
        except Exception as e:
            print(f"Error in dendrogram analysis: {e}")
            return None

    def experiment_5_cross_algorithm_comparison(self, dataset_key):
        """
        Experiment 5: DBSCAN vs HAC Comparison
        Compare DBSCAN and Hierarchical clustering performance
        """
        print(f"\nEXPERIMENT 5: DBSCAN vs HAC Comparison - {self.datasets[dataset_key]['name']}")
        print("="*80)
        
        if not self.validate_dataset(dataset_key):
            print(f"Skipping algorithm comparison for {dataset_key} due to insufficient data")
            return None
        
        data = self.datasets[dataset_key]['data']
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
        
        # Sample for comparison if necessary
        max_samples = 1000
        original_length = len(data_scaled)
        if len(data_scaled) > max_samples:
            sample_idx = np.random.choice(len(data_scaled), max_samples, replace=False)
            data_scaled = data_scaled[sample_idx]
        
        comparison_results = {}
        
        # DBSCAN results
        dbscan_params = self.results.get(dataset_key, {}).get('dbscan_optimization', {}).get('best_params')
        if dbscan_params:
            try:
                dbscan = DBSCAN(eps=dbscan_params['eps'], min_samples=dbscan_params['min_samples'])
                dbscan_labels = dbscan.fit_predict(data_scaled)
                
                dbscan_n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
                dbscan_n_noise = list(dbscan_labels).count(-1)
                
                # Calculate metrics excluding noise points
                non_noise_mask = dbscan_labels != -1
                if dbscan_n_clusters > 1 and np.sum(non_noise_mask) > 1:
                    try:
                        dbscan_silhouette = silhouette_score(data_scaled[non_noise_mask], 
                                                           dbscan_labels[non_noise_mask])
                    except:
                        dbscan_silhouette = -1
                else:
                    dbscan_silhouette = -1
                
                comparison_results['DBSCAN'] = {
                    'labels': dbscan_labels,
                    'n_clusters': dbscan_n_clusters,
                    'n_noise': dbscan_n_noise,
                    'silhouette_score': dbscan_silhouette,
                    'algorithm': 'Density-based',
                    'params': dbscan_params
                }
            except Exception as e:
                print(f"Error with DBSCAN: {e}")
        
        # HAC results
        hac_results = self.results.get(dataset_key, {}).get('hac_linkages', {})
        if hac_results:
            try:
                best_linkage = max(hac_results.keys(), key=lambda x: hac_results[x]['best_score'])
                best_hac = hac_results[best_linkage]
                
                hac = AgglomerativeClustering(n_clusters=best_hac['optimal_k'], linkage=best_linkage)
                hac_labels = hac.fit_predict(data_scaled)
                
                hac_silhouette = silhouette_score(data_scaled, hac_labels)
                
                comparison_results['HAC'] = {
                    'labels': hac_labels,
                    'n_clusters': best_hac['optimal_k'],
                    'n_noise': 0,  # HAC doesn't have noise points
                    'silhouette_score': hac_silhouette,
                    'algorithm': 'Hierarchical',
                    'params': {'linkage': best_linkage, 'n_clusters': best_hac['optimal_k']}
                }
            except Exception as e:
                print(f"Error with HAC: {e}")
        
        if not comparison_results:
            print("No valid algorithm results to compare")
            return None
        
        # Calculate additional comparison metrics if we have ground truth
        target = self.datasets[dataset_key].get('target')
        if target is not None:
            target = np.asarray(target)
            # Align the target with the sampled rows rather than taking the first N rows
            if original_length > max_samples and len(target) == original_length:
                target = target[sample_idx]
            
            if len(target) == len(data_scaled):
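                # ARI and NMI compare predicted labels with ground truth and are
                # invariant to label permutation; ARI is chance-corrected (~0 for
                # random assignments, 1 for a perfect match), NMI lies in [0, 1].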
                for alg_name, results in comparison_results.items():
                    if results['n_clusters'] > 1:
                        try:
                            results['ari_score'] = adjusted_rand_score(target, results['labels'])
                            results['nmi_score'] = normalized_mutual_info_score(target, results['labels'])
                        except:
                            results['ari_score'] = -1
                            results['nmi_score'] = -1
        
        # Store results
        self.results.setdefault(dataset_key, {})['algorithm_comparison'] = comparison_results
        
        # Visualization
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'DBSCAN vs HAC Comparison - {self.datasets[dataset_key]["name"]}', fontsize=16)
        
        # Performance metrics comparison
        algorithms = list(comparison_results.keys())
        silhouette_scores = [comparison_results[alg]['silhouette_score'] for alg in algorithms]
        n_clusters = [comparison_results[alg]['n_clusters'] for alg in algorithms]
        n_noise = [comparison_results[alg]['n_noise'] for alg in algorithms]
        
        if algorithms:
            # Set colors based on available algorithms
            colors = ['blue', 'green'][:len(algorithms)]
            
            axes[0, 0].bar(algorithms, silhouette_scores, color=colors, alpha=0.7, edgecolor='black')
            axes[0, 0].set_ylabel('Silhouette Score')
            axes[0, 0].set_title('Silhouette Score Comparison')
            axes[0, 0].grid(True, alpha=0.3)
            
            axes[0, 1].bar(algorithms, n_clusters, color=colors, alpha=0.7, edgecolor='black')
            axes[0, 1].set_ylabel('Number of Clusters')
            axes[0, 1].set_title('Number of Clusters')
            axes[0, 1].grid(True, alpha=0.3)
            
            axes[0, 2].bar(algorithms, n_noise, color=colors, alpha=0.7, edgecolor='black')
            axes[0, 2].set_ylabel('Number of Noise Points')
            axes[0, 2].set_title('Noise Points (DBSCAN only)')
            axes[0, 2].grid(True, alpha=0.3)
        
        # Clustering visualizations
        if data_scaled.shape[1] > 2:
            pca = PCA(n_components=2, random_state=42)
            data_pca = pca.fit_transform(data_scaled)
        else:
            data_pca = data_scaled
        
        for i, (alg_name, results) in enumerate(comparison_results.items()):
            ax = axes[1, i]
            
            labels = results['labels']
            scatter = ax.scatter(data_pca[:, 0], data_pca[:, 1], c=labels, 
                               cmap='viridis', alpha=0.6, s=50)
            
            ax.set_xlabel('First Principal Component')
            ax.set_ylabel('Second Principal Component')
            ax.set_title(f'{alg_name} Clustering\n(k={results["n_clusters"]}, score={results["silhouette_score"]:.3f})')
            plt.colorbar(scatter, ax=ax)
        
        # Algorithm characteristics comparison
        if len(comparison_results) > 0:
            ax = axes[1, 2]
            characteristics = {
                'DBSCAN': ['Density-based', 'Handles noise', 'No predefined k', 'Finds arbitrary shapes'],
                'HAC': ['Hierarchy-based', 'No noise handling', 'Needs predefined k', 'Cluster shape depends on linkage']
            }
            
            y_pos = 0.8
            for alg in algorithms:
                if alg in characteristics:
                    chars = characteristics[alg]
                    ax.text(0.1, y_pos, f'{alg}:', fontweight='bold', fontsize=12, transform=ax.transAxes)
                    for j, char in enumerate(chars):
                        ax.text(0.15, y_pos - 0.1 * (j + 1), f'• {char}', fontsize=10, transform=ax.transAxes)
                    y_pos -= 0.6
            
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.set_title('Algorithm Characteristics')
            ax.axis('off')
        
        plt.tight_layout()
        plt.show()
        
        # Detailed comparison table
        print(f"DETAILED ALGORITHM COMPARISON")
        print("-" * 80)
        
        if len(comparison_results) >= 2:
            print(f"{'Metric':<20} {'DBSCAN':<15} {'HAC':<15}")
            print("-" * 80)
            
            metrics = ['n_clusters', 'silhouette_score', 'n_noise']
            for metric in metrics:
                dbscan_val = comparison_results.get('DBSCAN', {}).get(metric, 'N/A')
                hac_val = comparison_results.get('HAC', {}).get(metric, 'N/A')
                
                if isinstance(dbscan_val, float):
                    dbscan_str = f"{dbscan_val:.3f}"
                else:
                    dbscan_str = str(dbscan_val)
                    
                if isinstance(hac_val, float):
                    hac_str = f"{hac_val:.3f}"
                else:
                    hac_str = str(hac_val)
                
                print(f"{metric:<20} {dbscan_str:<15} {hac_str:<15}")
            
            # Add ARI and NMI if available
            if 'ari_score' in comparison_results.get('DBSCAN', {}):
                print(f"{'ari_score':<20} {comparison_results['DBSCAN']['ari_score']:<15.3f} {comparison_results['HAC']['ari_score']:<15.3f}")
                print(f"{'nmi_score':<20} {comparison_results['DBSCAN']['nmi_score']:<15.3f} {comparison_results['HAC']['nmi_score']:<15.3f}")
            
            # Determine winner
            dbscan_score = comparison_results.get('DBSCAN', {}).get('silhouette_score', -2)
            hac_score = comparison_results.get('HAC', {}).get('silhouette_score', -2)
            
            if dbscan_score > hac_score and dbscan_score > -1:
                winner = 'DBSCAN'
                margin = dbscan_score - hac_score
            elif hac_score > dbscan_score and hac_score > -1:
                winner = 'HAC'
                margin = hac_score - dbscan_score
            else:
                winner = 'Tie/No valid scores'
                margin = 0
            
            print(f"\nWinner: {winner} (margin: {margin:.3f})")
        else:
            print("Insufficient results for comparison")
        
        return comparison_results

    def run_all_experiments(self):
        """Run the complete experimental suite for DBSCAN and HAC"""
        print("STARTING COMPREHENSIVE DBSCAN & HAC EXPERIMENTAL SUITE")
        print("="*90)
        
        # Load datasets
        datasets = self.load_datasets()
        
        if not datasets:
            print("ERROR: No datasets loaded. Please check file paths.")
            return
        
        # Run experiments for each dataset
        for dataset_key in datasets.keys():
            print(f"\nANALYZING DATASET: {datasets[dataset_key]['name']}")
            print("="*90)
            
            try:
                # Run all experiments
                self.experiment_1_dbscan_parameter_optimization(dataset_key)
                self.experiment_2_density_analysis(dataset_key)
                self.experiment_3_hierarchical_clustering_linkages(dataset_key)
                self.experiment_4_dendrogram_analysis(dataset_key)
                self.experiment_5_cross_algorithm_comparison(dataset_key)
                
            except Exception as e:
                print(f"ERROR in experiments for {dataset_key}: {e}")
                import traceback
                traceback.print_exc()
                continue
        
        # Final summary
        self.generate_final_summary()

    def generate_final_summary(self):
        """Generate comprehensive summary of all experiments"""
        print(f"\nCOMPREHENSIVE EXPERIMENTAL SUMMARY")
        print("="*90)
        
        for dataset_key, dataset_results in self.results.items():
            dataset_name = self.datasets[dataset_key]['name']
            print(f"\n{dataset_name}")
            print("-" * 60)
            
            # DBSCAN results
            if 'dbscan_optimization' in dataset_results:
                dbscan_params = dataset_results['dbscan_optimization']['best_params']
                if dbscan_params:
                    print(f"   DBSCAN Optimal Parameters:")
                    print(f"     Eps: {dbscan_params['eps']:.3f}")
                    print(f"     Min_samples: {dbscan_params['min_samples']}")
            
            # Density analysis
            if 'density_analysis' in dataset_results:
                density_info = dataset_results['density_analysis']
                print(f"   Density Analysis:")
                print(f"     Clusters: {density_info['n_clusters']}")
                print(f"     Core points: {density_info['n_core']}")
                print(f"     Noise points: {density_info['n_noise']}")
            
            # HAC results
            if 'hac_linkages' in dataset_results:
                hac_results = dataset_results['hac_linkages']
                if hac_results:
                    best_linkage = max(hac_results.keys(), key=lambda x: hac_results[x]['best_score'])
                    best_score = hac_results[best_linkage]['best_score']
                    optimal_k = hac_results[best_linkage]['optimal_k']
                    print(f"   HAC Best Results:")
                    print(f"     Linkage method: {best_linkage}")
                    print(f"     Optimal k: {optimal_k}")
                    print(f"     Silhouette score: {best_score:.3f}")
            
            # Algorithm comparison
            if 'algorithm_comparison' in dataset_results:
                comparison = dataset_results['algorithm_comparison']
                print(f"   Algorithm Winner:")
                if 'DBSCAN' in comparison and 'HAC' in comparison:
                    dbscan_score = comparison['DBSCAN']['silhouette_score']
                    hac_score = comparison['HAC']['silhouette_score']
                    winner = 'DBSCAN' if dbscan_score > hac_score else 'HAC'
                    print(f"     {winner} (DBSCAN: {dbscan_score:.3f}, HAC: {hac_score:.3f})")
        
        print(f"\nDBSCAN & HAC EXPERIMENTAL SUITE COMPLETE!")
        print("="*90)


# Usage Example
if __name__ == "__main__":
    # Create experiment suite
    experiment_suite = DBSCANHACExperimentSuite(
        eps_range=(0.1, 2.0),
        eps_steps=20,
        min_samples_range=(3, 15)
    )
    
    # Run all experiments
    experiment_suite.run_all_experiments()
    
    # Access results programmatically
    print("\nResults stored in experiment_suite.results for further analysis")
    
    # Example: access specific results
    # dbscan_params = experiment_suite.results['collision']['dbscan_optimization']['best_params']
    # hac_results = experiment_suite.results['collision']['hac_linkages']
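    # Hypothetical follow-up (a sketch, not part of the suite itself): flatten the
    # per-linkage HAC results into a DataFrame for side-by-side inspection. This
    # assumes the nested structure used above, i.e. results[dataset]['hac_linkages']
    # maps each linkage method to a dict with 'optimal_k' and 'best_score'.
    # hac_summary = pd.DataFrame(
    #     {(ds, method): {'optimal_k': res['optimal_k'], 'best_score': res['best_score']}
    #      for ds, ds_results in experiment_suite.results.items()
    #      for method, res in ds_results.get('hac_linkages', {}).items()}
    # ).T
    # print(hac_summary)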
STARTING COMPREHENSIVE DBSCAN & HAC EXPERIMENTAL SUITE
==========================================================================================
Loading datasets...
Collision dataset loaded: (232, 23)
Injury prediction dataset loaded: (5000, 86)
Big Data Bowl raw data shape: (5863, 120)
Numeric columns found: 120
After removing constant/NaN columns: (5863, 120)
After removing NaN rows: (5863, 120)
Final sampled data shape: (3000, 120)
Target variable loaded: 3000 values
Big Data Bowl dataset loaded successfully: (3000, 120)
Successfully loaded 3 datasets

ANALYZING DATASET: NFL Collision Analysis
==========================================================================================

EXPERIMENT 1: DBSCAN Parameter Optimization - NFL Collision Analysis
================================================================================
Data shape for DBSCAN: (232, 23)
Computing k-distance graph for eps estimation...
Grid search for optimal DBSCAN parameters...
No valid clustering found in parameter space

EXPERIMENT 2: Density Analysis - NFL Collision Analysis
================================================================================
Using default parameters: eps=0.5, min_samples=5
Analyzing point densities and core points...
Density Analysis Results:
   Total points: 232
   Core points: 0 (0.0%)
   Border points: 0 (0.0%)
   Noise points: 232 (100.0%)
   Number of clusters: 0

EXPERIMENT 3: Hierarchical Clustering Linkages - NFL Collision Analysis
================================================================================
Testing different linkage methods...
   Testing ward linkage...
   Testing complete linkage...
   Testing average linkage...
   Testing single linkage...
Linkage Method Performance Comparison:
------------------------------------------------------------
Method       Optimal k  Best Silhouette
------------------------------------------------------------
Ward         2          0.137          
Complete     3          0.444          
Average      2          0.497          
Single       2          0.497          

Best linkage method: Average (score: 0.497)

EXPERIMENT 4: Dendrogram Analysis - NFL Collision Analysis
================================================================================
Using average linkage for detailed analysis...
Dendrogram Analysis Results:
   Best linkage method: average
   Optimal cut height: 9.495
   Resulting clusters: 3.0
   Silhouette score: 0.444
   Distance range: 1.716 - 13.084

EXPERIMENT 5: DBSCAN vs HAC Comparison - NFL Collision Analysis
================================================================================
DETAILED ALGORITHM COMPARISON
--------------------------------------------------------------------------------
Insufficient results for comparison

ANALYZING DATASET: NFL Injury Prediction
==========================================================================================

EXPERIMENT 1: DBSCAN Parameter Optimization - NFL Injury Prediction
================================================================================
Warning: Dataset injury_prediction has features with very low variance
Data shape for DBSCAN: (5000, 86)
Computing k-distance graph for eps estimation...
Grid search for optimal DBSCAN parameters...
No valid clustering found in parameter space

EXPERIMENT 2: Density Analysis - NFL Injury Prediction
================================================================================
Warning: Dataset injury_prediction has features with very low variance
Using default parameters: eps=0.5, min_samples=5
Analyzing point densities and core points...
Density Analysis Results:
   Total points: 5000
   Core points: 0 (0.0%)
   Border points: 0 (0.0%)
   Noise points: 5000 (100.0%)
   Number of clusters: 0

EXPERIMENT 3: Hierarchical Clustering Linkages - NFL Injury Prediction
================================================================================
Warning: Dataset injury_prediction has features with very low variance
Sampling 1000 points for hierarchical clustering (original: 5000)
Testing different linkage methods...
   Testing ward linkage...
   Testing complete linkage...
   Testing average linkage...
   Testing single linkage...
Linkage Method Performance Comparison:
------------------------------------------------------------
Method       Optimal k  Best Silhouette
------------------------------------------------------------
Ward         3          0.050          
Complete     2          0.711          
Average      2          0.711          
Single       2          0.711          

Best linkage method: Complete (score: 0.711)

EXPERIMENT 4: Dendrogram Analysis - NFL Injury Prediction
================================================================================
Warning: Dataset injury_prediction has features with very low variance
Sampling 500 points for detailed dendrogram analysis
Using complete linkage for detailed analysis...
Dendrogram Analysis Results:
   Best linkage method: complete
   Optimal cut height: 28.482
   Resulting clusters: 2.0
   Silhouette score: 0.621
   Distance range: 3.853 - 39.451

EXPERIMENT 5: DBSCAN vs HAC Comparison - NFL Injury Prediction
================================================================================
Warning: Dataset injury_prediction has features with very low variance
DETAILED ALGORITHM COMPARISON
--------------------------------------------------------------------------------
Insufficient results for comparison

ANALYZING DATASET: NFL Big Data Bowl
==========================================================================================

EXPERIMENT 1: DBSCAN Parameter Optimization - NFL Big Data Bowl
================================================================================
Data shape for DBSCAN: (3000, 120)
Computing k-distance graph for eps estimation...
Grid search for optimal DBSCAN parameters...
No valid clustering found in parameter space

EXPERIMENT 2: Density Analysis - NFL Big Data Bowl
================================================================================
Using default parameters: eps=0.5, min_samples=5
Analyzing point densities and core points...
Density Analysis Results:
   Total points: 3000
   Core points: 0 (0.0%)
   Border points: 0 (0.0%)
   Noise points: 3000 (100.0%)
   Number of clusters: 0

EXPERIMENT 3: Hierarchical Clustering Linkages - NFL Big Data Bowl
================================================================================
Sampling 1000 points for hierarchical clustering (original: 3000)
Testing different linkage methods...
   Testing ward linkage...
   Testing complete linkage...
   Testing average linkage...
   Testing single linkage...
Linkage Method Performance Comparison:
------------------------------------------------------------
Method       Optimal k  Best Silhouette
------------------------------------------------------------
Ward         2          0.486          
Complete     2          0.522          
Average      2          0.522          
Single       2          0.522          

Best linkage method: Complete (score: 0.522)

EXPERIMENT 4: Dendrogram Analysis - NFL Big Data Bowl
================================================================================
Sampling 500 points for detailed dendrogram analysis
Using complete linkage for detailed analysis...
Dendrogram Analysis Results:
   Best linkage method: complete
   Optimal cut height: 39.831
   Resulting clusters: 2.0
   Silhouette score: 0.737
   Distance range: 3.439 - 62.787

EXPERIMENT 5: DBSCAN vs HAC Comparison - NFL Big Data Bowl
================================================================================
DETAILED ALGORITHM COMPARISON
--------------------------------------------------------------------------------
Insufficient results for comparison

COMPREHENSIVE EXPERIMENTAL SUMMARY
==========================================================================================

NFL Collision Analysis
------------------------------------------------------------
   Density Analysis:
     Clusters: 0
     Core points: 0
     Noise points: 232
   HAC Best Results:
     Linkage method: average
     Optimal k: 2
     Silhouette score: 0.497
   Algorithm Winner:

NFL Injury Prediction
------------------------------------------------------------
   Density Analysis:
     Clusters: 0
     Core points: 0
     Noise points: 5000
   HAC Best Results:
     Linkage method: complete
     Optimal k: 2
     Silhouette score: 0.711
   Algorithm Winner:

NFL Big Data Bowl
------------------------------------------------------------
   Density Analysis:
     Clusters: 0
     Core points: 0
     Noise points: 3000
   HAC Best Results:
     Linkage method: complete
     Optimal k: 2
     Silhouette score: 0.522
   Algorithm Winner:

DBSCAN & HAC EXPERIMENTAL SUITE COMPLETE!
==========================================================================================

Results stored in experiment_suite.results for further analysis
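Every DBSCAN run above collapsed to 100% noise with the default eps=0.5, which suggests that radius is far below the typical k-th-neighbour distance in these standardized, high-dimensional feature spaces. The snippet below is a minimal, self-contained sketch of the k-distance heuristic the parameter search already leans on; it uses synthetic make_blobs data as a stand-in for the real feature matrices, and the "largest jump" elbow rule is an illustrative assumption rather than part of the suite.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

# Synthetic stand-in for one of the standardized feature matrices above
X, _ = make_blobs(n_samples=500, n_features=20, centers=3, random_state=0)
X = StandardScaler().fit_transform(X)

min_samples = 5

# Sorted distance to each point's min_samples-th neighbour (the k-distance curve)
nn = NearestNeighbors(n_neighbors=min_samples).fit(X)
distances, _ = nn.kneighbors(X)
k_distances = np.sort(distances[:, -1])

# Crude elbow estimate: the largest jump in the upper half of the sorted curve
upper = k_distances[len(k_distances) // 2:]
eps_estimate = float(upper[np.argmax(np.diff(upper))])

labels = DBSCAN(eps=eps_estimate, min_samples=min_samples).fit_predict(X)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"estimated eps: {eps_estimate:.3f}, clusters: {n_clusters}, "
      f"noise fraction: {(labels == -1).mean():.1%}")

On the real datasets, the same curve would be computed from the scaled matrix used in Experiment 1; eps values near the elbow are usually a more sensible starting point for the grid search than the 0.5 default.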